set.seed(1234)suppressMessages(library(caret))load(file ='cancer_data_10features.RData')train_obs <-createDataPartition(y = cancer_data_10features$diagnosis, p = .80, list =FALSE)train <- cancer_data_10features[train_obs, 1:3]test <- cancer_data_10features[-train_obs, 1:3]# Confirm both training and test are balanced with respect to diagnosis (malign tumor)print(paste("Percentage of training data consisting of malign tumors:", 100*mean(train$diagnosis =="M")))
[1] "Percentage of training data consisting of malign tumors: 37.280701754386"
print(paste("Percentage of test data consisting of malign tumors:", 100*mean(test$diagnosis =="M")))
[1] "Percentage of test data consisting of malign tumors: 37.1681415929204"
# Train and test for each tumor classind_train_M <- train$diagnosis =="M"ind_test_M <- test$diagnosis =="M"plot(train[ind_train_M, 2], train[ind_train_M, 3], xlab ="Radius", ylab ="Texture", col ="cornflowerblue", xlim =c(5, 30), ylim =c(5, 40))lines(train[!ind_train_M, 2], train[!ind_train_M, 3], col ="lightcoral", type ="p")lines(test[ind_test_M, 2], test[ind_test_M, 3], col ="cornflowerblue", type ="p", pch ="x")lines(test[!ind_test_M, 2], test[!ind_test_M, 3], col ="lightcoral", type ="p", pch ="x")legend("topright", legend =c("Malign train", "Benign train", "Malign test", "Benign test"), col =c("cornflowerblue", "lightcoral", "cornflowerblue", "lightcoral"), pch =c("o", "o", "x", "x"))
💪 Problem 1.1
Derive (analytically) \(p(\mathbf{x})\) for the example above.
\[
\text{p(x) is given by : } p(x) = \sum_{m=1}^{2} p(x|y=m) \cdot Pr(y=m)
\]
\[
Pr(y=m) \text{ are the marginal probabilities which are } \pi_1, \pi_2
\]
\[
p(x|y = m) \text{ follows a normal distribution: }\\ p(x|y=m) = \frac{1}{(2\pi)^{d/2}|\boldsymbol{\Sigma}|^{1/2}} e^{-\frac{1}{2}(x - \mu_m)^T \boldsymbol{\Sigma}^{-1} (x - \mu_m)}
\]
\[
\text{ Starting with the first class } p(x|y=1) \text{ and substituting this into the equation } p(x|y=m)
\\p(x|y=1) = \frac{1}{(2\pi)^{d/2}|\boldsymbol{\Sigma}|^{1/2}} e^{-\frac{1}{2}(x - \mu_m)^T \boldsymbol{\Sigma}^{-1} (x - \mu_m)}
\]
\[
\text{ Multiplying the above by the prior probability } \pi_1
\\ \pi_1 \cdot p(x|y=1) = \pi_1 \cdot \frac{1}{(2\pi)^{d/2}|\boldsymbol{\Sigma}|^{1/2}} e^{-\frac{1}{2}(x - \mu_1)^T \boldsymbol{\Sigma}^{-1} (x - \mu_1)}
\]
\[
\text{Doing the same for the 2nd class } \pi_2 \cdot p(x|y=2) = \pi_2 \cdot \frac{1}{(2\pi)^{d/2}|\boldsymbol{\Sigma}|^{1/2}} e^{-\frac{1}{2}(x - \mu_2)^T \boldsymbol{\Sigma}^{-1} (x - \mu_2)}
\]
\[
\text{Combining the two of the above together results in the marginal probability of x: }
\\p(x) = \pi_1 \cdot \frac{1}{(2\pi)^{d/2}|\boldsymbol{\Sigma}|^{1/2}} e^{-\frac{1}{2}(x - \mu_1)^T \boldsymbol{\Sigma}^{-1} (x - \mu_1)} + \pi_2 \cdot \frac{1}{(2\pi)^{d/2}|\boldsymbol{\Sigma}|^{1/2}} e^{-\frac{1}{2}(x - \mu_2)^T \boldsymbol{\Sigma}^{-1} (x - \mu_2)}
\]
Problem 1.2
Given the training data, estimate the parameters \(\pi_m,\boldsymbol{\mu}_m\) for \(m=1,2\), and \(\boldsymbol{\Sigma}\). Predict the labels of the test data and plot them in a scatter plot with different colors to represent the different classes.
#parameters we need to determine are for 2 classes: 1 being malign and 1 for benign #for each class we need to predict the pi of class, mean of class and sigma matrix or pooled covariance matrix #using machine learning a first course for engineers pages 260 and 261 to help with the below#class 1 can be for malign#class 2 can be fore benignparameter_estimation =function(train_data) { number_obs =nrow(train_data) number_malign_obs =sum(train_data$diagnosis =="M") number_benign_obs=sum(train_data$diagnosis =="B")#calculating probabilities or pi pi_malign = number_malign_obs / number_obs #probability of malign pi_benign = number_benign_obs / number_obs #probability of benign#calculating mean or mu of both radius and texture (or features)#filtering the training data and calculating means malign = train_data[train_data$diagnosis =="M", -1] benign = train_data[train_data$diagnosis =="B", -1] mu_malign =colMeans(malign) mu_benign =colMeans(benign) malign =as.matrix(malign) benign =as.matrix(benign)#now calculating covariance matrix cov_malign =t(sweep(malign, 2, mu_malign, "-")) %*%sweep(malign, 2, mu_malign, "-") cov_benign =t(sweep(benign, 2, mu_benign, "-")) %*%sweep(benign, 2, mu_benign, "-") cov_matrix =1/number_obs * (cov_malign + cov_benign) parameters =list(pi_malign = pi_malign, pi_benign= pi_benign, mu_malign = mu_malign, mu_benign = mu_benign, cov_matrix = cov_matrix)return(parameters)}predict_diagnosis =function(test_data, parameters) { cov_matrix_invers =solve(parameters$cov_matrix)apply(test_data[, -1], 1, function(row) { score_malign =t(row) %*% cov_matrix_invers %*% parameters$mu_malign -0.5*t(parameters$mu_malign) %*% cov_matrix_invers %*% parameters$mu_malign +log(parameters$pi_malign) score_benign =t(row) %*% cov_matrix_invers %*% parameters$mu_benign -0.5*t(parameters$mu_benign) %*% cov_matrix_invers %*% parameters$mu_benign +log(parameters$pi_benign)return(ifelse(score_malign > score_benign, "M", "B")) })}estimated_parameters =parameter_estimation(train)y_hat_test_linear =predict_diagnosis(test, estimated_parameters)cat("\nThe estimated pi for malign is: ", estimated_parameters$pi_malign)
The estimated pi for malign is: 0.372807
cat("\nThe estimated pi for benign is: ", estimated_parameters$pi_benign)
The estimated pi for benign is: 0.627193
cat("\nThe estimated mu for malign is: ", estimated_parameters$mu_malign)
The estimated mu for malign is: 17.53441 21.62594
cat("\nThe estimated mu for benign is: ", estimated_parameters$mu_benign)
The estimated mu for benign is: 12.10734 17.86224
cat("\nThe covariance matrix is: ", estimated_parameters$cov_matrix)
The covariance matrix is: 6.177451 0.5499085 0.5499085 15.57662
ind_test_M = test$diagnosis =="M"ind_predicted_M = y_hat_test_linear =="M"plot(test[ind_test_M, 2], test[ind_test_M, 3], xlab ="Radius", ylab ="Texture", col ="black", xlim =c(5, 30), ylim =c(5, 40), main ="Diagnosed vs Predicted for test data")lines(test[!ind_test_M, 2], test[!ind_test_M, 3], col ="green",type ="p")lines(test[ind_predicted_M, 2], test[ind_predicted_M, 3], col ="cornflowerblue", type ="p", pch ="x")lines(test[!ind_predicted_M, 2], test[!ind_predicted_M, 3], col ="lightcoral",type ="p", pch ="x")legend("topright", legend =c("Diagnosed Malign", "Diagnosed Benign", "Predicted Malign", "Predicted Benign"), col =c("black", "green", "cornflowerblue", "lightcoral"), pch =c("o", "o", "x", "x"))
Problem 1.3
Plot the decision bound of the classifier and the predictions of the test data from Problem 1.2 in the same plot.
x1_grid <-seq(5, 30, length.out =100)cov_matrix_inv =solve(estimated_parameters$cov_matrix)gamma_1 = (cov_matrix_inv %*% (estimated_parameters$mu_malign - estimated_parameters$mu_benign))[1] / (cov_matrix_inv %*% (estimated_parameters$mu_malign - estimated_parameters$mu_benign))[2]gamma_0 =0.5* (t(estimated_parameters$mu_benign) %*% cov_matrix_inv %*% estimated_parameters$mu_benign -t(estimated_parameters$mu_malign) %*% cov_matrix_inv %*% estimated_parameters$mu_malign) +log(estimated_parameters$pi_malign) -log(estimated_parameters$pi_benign) - (cov_matrix_inv %*% (estimated_parameters$mu_malign + estimated_parameters$mu_benign))[1] * gamma_1#this extracts the scalar value from the matrix calc. gamm_0 is initially a matrix from the above calculation but it needs to be scalargamma_0 = gamma_0[1,1]x2_grid = gamma_0 + gamma_1 * x1_gridplot(test[ind_test_M, 2], test[ind_test_M, 3], xlab ="Radius", ylab ="Texture", col ="black", xlim =c(5, 30), ylim =c(5, 40), main ="Diagnosed vs Predicted for test data")lines(test[!ind_test_M, 2], test[!ind_test_M, 3], col ="green",type ="p")lines(test[ind_predicted_M, 2], test[ind_predicted_M, 3], col ="cornflowerblue", type ="p", pch ="x")lines(test[!ind_predicted_M, 2], test[!ind_predicted_M, 3], col ="lightcoral",type ="p", pch ="x")lines(x1_grid, x2_grid, col ="black", lty =2, lwd =2)legend("topright", legend =c("Diagnosed Malign", "Diagnosed Benign", "Predicted Malign", "Predicted Benign", "Decision Boundary"), col =c("black", "green", "cornflowerblue", "lightcoral", "black"), pch =c("o", "o", "x", "x", "-"))
Problem 1.4
Fit a logistic regression to the training data using the glm() function. Compare the results to the generative model in Problem 1.2. Comment on the results.
library(pROC)
Type 'citation("pROC")' for a citation.
Attaching package: 'pROC'
The following objects are masked from 'package:stats':
cov, smooth, var
glm_fit =glm(diagnosis ~ ., family = binomial, data = train)summary(glm_fit)
Call:
glm(formula = diagnosis ~ ., family = binomial, data = train)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -18.4908 1.8430 -10.033 < 2e-16 ***
radius 0.9962 0.1076 9.256 < 2e-16 ***
texture 0.1952 0.0395 4.943 7.7e-07 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 602.31 on 455 degrees of freedom
Residual deviance: 242.76 on 453 degrees of freedom
AIC: 248.76
Number of Fisher Scoring iterations: 7
In problem 1.2, the accuracy of the model is 90.2%, with a sensitivity of 80.9% and specificity of 95.7%. The logistic regression model has an accuracy of 87.6%, sensitivity of 80.9% and specificity of 91.5%. Comparing these metrics it appears that the linear model is better at correctly identifying negative values i.e. benign relative to the logistic regression model. The logistic regression model has an accuracy of 87.6% which is worse then the linear model indicating that the linear model is better at predicting correct values both negative and positive. Finally, the linear model has a sensitivity of 80.9% vs 80.9% for the logistic model, each model has the same accuracy of predicting positive or malign values. Overall, I’d say the linear model is a better model then the logistic model as the linear model has higher accuracy and better at predicting negative values.
Based on the ROC, the linear discriminant model (in lightcoral) is closest to the top left hand corner of the plot and this also indicates that it is a better model.
2. Quadratic discriminant analysis for cancer data
💪 Problem 2.1
Derive (analytically) \(p(\mathbf{x})\) with the new class conditional distribution above.
\[
\text{The difference between problem 1.1 and 2.1 is that each class in a quadratic discriminant analysis have their own covariance matrix.}
\]
\[
\text{ For class 1, the conditional probability is: }
\]
Given the training data, estimate the parameters \(\pi_m,\boldsymbol{\mu}_m\), and \(\boldsymbol{\Sigma}_m\) for \(m=1,2\). Predict the labels of the test data and plot them in a scatter plot with different colors to represent the different classes.
pi_malign = estimated_parameters$pi_malignpi_benign = estimated_parameters$pi_benignmu_malign = estimated_parameters$mu_malignmu_benign = estimated_parameters$mu_benigncovariance_benign =cov(train[train$diagnosis =="B", ][, c("radius", "texture")])covariance_malign =cov(train[train$diagnosis =="M", ][, c("radius", "texture")])cat("\nThe estimated pi for malign is: ", pi_malign)
The estimated pi for malign is: 0.372807
cat("\nThe estimated pi for benign is: ", pi_benign)
The estimated pi for benign is: 0.627193
cat("\nThe estimated mu for malign is: ", mu_malign)
The estimated mu for malign is: 17.53441 21.62594
cat("\nThe estimated mu for benign is: ", mu_benign)
The estimated mu for benign is: 12.10734 17.86224
cat("\nThe covariance matrix for benign is: ", covariance_benign)
The covariance matrix for benign is: 3.269885 -0.2823108 -0.2823108 16.62193
cat("\nThe covariance matrix for malign is: ", covariance_malign)
The covariance matrix for malign is: 11.15385 1.959863 1.959863 13.99816
The linear model has an accuracy of 90.27%, quadratic 87.61% and logistic 87.61%. The linear model has the highest accuracy. The linear model has a sensitivity value of 80.95%, quadratic 80.95% and logistic 80.95%, all three models have the same sensitivity the linear model has a specificity of 95.77%, the logistic and quadratic 91.55%
Overall, it is interesting to note that the logistic and quadratic models have very similar results where they are exactly the same in every measurement indicating that they are very similar models. Overall, the linear model is the best out of the three as it outperforms in accuracy and specificity.
Looking at the ROC curve, the quadratic discriminant model is the worst of the three as it is the furthest from the left hand corner.
Problem 2.4
A doctor contacts you and says she has a patient whose digitised image has the features radius=13.20 and texture=19.22. Use your best classifier from Problem 2.3 to provide the doctor with some advice.
new_data =data.frame(diagnosis ="", radius =13.20, texture =19.22)predicted_diagnosis =predict_diagnosis(new_data, estimated_parameters)if(predicted_diagnosis =="M") { advice ="Malignant diagnosis"} else { advice ="Benign diagnosis"}cat("Based on the data the patient diagnosis is: ", advice)
Based on the data the patient diagnosis is: Benign diagnosis
par(mfrow=c(1,2))load(file ='asb.RData')plot(Close ~log(Volume), data = asb, col ="cornflowerblue", main ="ASB stock: Closing price vs log of trading volume")date <-as.Date(asb$Date)Close <- asb$Closeplot(date, Close, col ="cornflowerblue",type ="l", main ="ASB stock: Closing prices during 2014-12-26 to 2017-11-10")
A possible interpretation of the two clusters could be due to the change in the average stock price level depicted in the second graph. We can see a clear break in the stock price trend at the end of 2016 from around $18 to a new level of around $24. This is reflected in our cluster graph as we can see that majority of the trading occurred at 1 price level around $18 and the other cluster where trading occurred at $24
3.2
From the law of total probability, the marginal distribution is given by:
[1] "Estimated class 2 marginal probability: 0.777. True: 0.8"
# Normalised histogramhist(x, xlab ="x", col ="cornflowerblue", main ="Normalised histogram of the simulated x", prob =TRUE)
x_grid<-seq(-6,10,length.out=1000)x_mix <-rep(NA, length(x_grid))for(i in1:n){ x_mix[i] <-rnorm(n =1, mean =2.8, sd =sqrt(7.61))}hist(x, col=rgb(0,0,1,1/4), prob =TRUE, main ="Normalised Histograms of simulated x and p(x) ") hist(x_mix, col=rgb(1,0,0,1/4), breaks =30, prob =TRUE, add = T)
4. Unsupervised learning via the EM algorithm
The expectation-maximisation (EM) algorithm is a powerful algorithm to learn the parameters in unsupervised learning models. In addition, the EM algorithm gives us the conditional (on the training data) distribution of the class membership for each observation, which we can use for classification.
The following function implements the EM algorithm. The implementation assumes a single feature and two classes.
EM_GMM_M2 <-function(x, mu_start, sigma_start, pis_start, n_iter =100) {# Estimates the parameters in an unsupervised Gaussian mixture model with M = 2 classes. # Runs the EM algorithm for n_iter iterations. x is assumed univariate. # mu_start, sigma_start, pis_start are starting values.stopifnot(sum(pis_start) ==1)# Quantities to save pis_all <-matrix(NA, nrow = n_iter, ncol =2) mu_all <-matrix(NA, nrow = n_iter, ncol =2) sigma_all <-matrix(NA, nrow = n_iter, ncol =2) Q_all <-rep(NA, n_iter) log_like_all <-rep(NA, n_iter)# Initialise mu <- mu_start sigma <- sigma_start pis <- pis_start n <-length(x) W <-matrix(0, nrow = n, ncol =2) # Unnormalised weights for each observation log_pdf_class <-matrix(0, nrow = n, ncol =2) # The log-likelihood of the two classes for each obs. To compute Q. for(j in1:n_iter){# Start EM steps# E-step: Compute the expected log-likelihood Qfor(m in1:2){# The log-density for each class log_pdf_class[, m] <-dnorm(x, mean = mu[m], sd = sigma[m], log =TRUE) +log(pis[m])# Unnormalised weights W[, m] <- pis[m]*dnorm(x, mean = mu[m], sd = sigma[m]) } w <- W/rowSums(W) # Normalise weights n_hat <-colSums(w) # Expected number of obs per class Q <-sum(rowSums(w*log_pdf_class)) # Expected log-likelihood# M-step: Maximise Q. Closed form analytical solution in Gaussian mixture modelsfor(m in1:2){ pis[m] <- n_hat[m]/n mu[m] <-1/n_hat[m]*sum(w[, m]*x) sigma[m] <-sqrt(1/n_hat[m]*sum(w[, m]*(x - mu[m])^2)) }# End EM steps. Save estimates, Q, and log-likelihood pis_all[j, ] <- pis mu_all[j, ] <- mu sigma_all[j, ] <- sigma Q_all[j] <- Q# Compute log-likelihood at current parameter valuesfor(m in1:2){# Unnormalised weights W[, m] <- pis[m]*dnorm(x, mean = mu[m], sd = sigma[m]) } log_like_all[j] <-sum(log(rowSums(W))) } # End EM iterations# Return everything as a listreturn(list(pi_hat = pis, mu_hat = mu, sigma_hat = sigma, weights = W/rowSums(W), pis_all = pis_all, mu_all = mu_all, sigma_all = sigma_all, Q_all = Q_all, log_like_all = log_like_all))}
The following code uses the above function to estimate the parameters using our simulated data form Problem 3, performs some convergence checks, and plots the class posterior probability distribution for an observation. Note, that the log-likelihood is guaranteed to not decrease at any iteration.
matplot(0:n_iter, rbind(mu_start, EM_result$mu_all), main ='mu', pch =c("o", "o"), col =c("cornflowerblue", "lightcoral"), xlab ="Iteration", ylab ="mu", ylim =c(-3, 6))legend("topright", legend =c("Class 1", "Class 2"), col =c("cornflowerblue", "lightcoral"), pch =c("o", "o"))
matplot(0:n_iter, rbind(sigma_start, EM_result$sigma_all), main ='sigma', pch =c("o", "o"), col =c("cornflowerblue", "lightcoral"), xlab ="Iteration", ylab ="sigma", ylim =c(0, 4))legend("topright", legend =c("Class 1", "Class 2"), col =c("cornflowerblue", "lightcoral"), pch =c("o", "o"))
par(mfrow =c(1, 1))# Inspect convergenceplot(EM_result$log_like_all, main ='Log-likelihood', type ="l", col ="cornflowerblue", xlab ="Iteration", ylab ="Log-likelihood")
plot(EM_result$Q_all, main ='Expected log-likelihood (Q)', type ="l", col ="cornflowerblue", xlab ="Iteration", ylab ="Log-likelihood")
print("True parameters (pi, mu, and sigma)")
[1] "True parameters (pi, mu, and sigma)"
print(pis)
[1] 0.2 0.8
print(mu)
[1] -2 4
print(sigma)
[1] 0.5 1.5
print("Estimated parameters (pi, mu, and sigma)")
[1] "Estimated parameters (pi, mu, and sigma)"
print(EM_result$pi_hat)
[1] 0.2235894 0.7764106
print(EM_result$mu_hat)
[1] -2.046600 4.018808
print(EM_result$sigma_hat)
[1] 0.4857986 1.4369935
# Inspect the classification probabilities of observation 10ind <-10barplot(names.arg =c("Class 1", "Class 2"), EM_result$weights[ind, ], col ="cornflowerblue", ylim =c(0, 1), main =paste("Class (posterior) probability observation ", ind, sep =""))
Problem 4.1
The likelihood in training Gaussian mixture models has M! maxima, where M is the number of classes and each maxima corresponds to a permutation of class labels on the same clusters. Hence, Gaussian mixture models are just as likely to label a cluster as ‘1’, or ‘2’ for example. In the above case, we’d expect label switching in 50% of cases since there are two classes, however, there is no label switching observed.
The following code plots a (normalised) histogram of the insects data and highlights the feature values of observations 6, 244, and 421.
load(file='insects.RData')hist(insects$length, col ="cornflowerblue", main ="Histogram of insects' lengths", prob =TRUE, xlab ="Length", ylim =c(0, 0.4), xlim =c(0, 14))abline(v = insects[6, ], lwd =1.5, col ="lightcoral")abline(v = insects[244, ], lwd =1.5, col ="purple")abline(v = insects[421, ], lwd =1.5, col ="lightpink")legend("topright", legend =c("Obs 6", "Obs 244", "Obs 421"), col =c("lightcoral", "purple", "lightpink"), lwd =c(1.5, 1.5, 1.5))
Problem 4.2
Use the EM_GMM_M2() function to estimate an unsupervised Gaussian mixture model for the insect data in Problem 3. Analyse the convergence. Compare the parameter estimates to those obtained by the function normalmixEM() in the mixtools package.
The following code uses the EM_GMM_M2() function to estimate an unsupervised Gaussian mixture model for the insect data, and shows some convergence checks.
par(mfrow =c(1, 1))# Inspect convergenceplot(EM_result_insects$log_like_all, main ='Log-likelihood', type ="l",col ="cornflowerblue", xlab ="Iteration", ylab ="Log-likelihood")
plot(EM_result_insects$Q_all, main ='Expected log-likelihood (Q)', type ="l",col ="cornflowerblue", xlab ="Iteration", ylab ="Log-likelihood")
print("Estimated parameters (pi, mu, and sigma)")
[1] "Estimated parameters (pi, mu, and sigma)"
print(EM_result_insects$pi_hat)
[1] 0.7930843 0.2069157
print(EM_result_insects$mu_hat)
[1] 3.993440 8.100479
print(EM_result_insects$sigma_hat)
[1] 0.9888078 0.6709056
The following code implements a similar EM algorithm via the normalmixEM() function in the mixtools package.
library(mixtools)
mixtools package, version 2.0.0, Released 2022-12-04
This package is based upon work supported by the National Science Foundation under Grant No. SES-0518772 and the Chan Zuckerberg Initiative: Essential Open Source Software for Science (Grant No. 2020-255193).
# using normalmixEMEM_object_insects <-normalmixEM(insects$length, pis_start, mu_start, sigma_start)
number of iterations= 28
print("Estimated parameters (pi, mu, and sigma)")
[1] "Estimated parameters (pi, mu, and sigma)"
print(EM_object_insects$lambda)
[1] 0.7935566 0.2064434
print(EM_object_insects$mu)
[1] 3.994984 8.103936
print(EM_object_insects$sigma)
[1] 0.9905859 0.6674937
For the insect data, the EM algorithm converges in 28 iterations, with no improvements in the log-likelihood for the last few computed iterations. Even so, the class 1 parameters settled quite quickly, while the class 2 parameters settled towards the end of the optimisation.
The ‘normalmixEM’ function is a package implementation of the EM algorithm for GMMs. This is demonstrated by the similarity in estimated parameters, and convergence time for the ‘normalmixEM’ function, when compared to the implemented EM algorithm for the fitting of a GMM. The parameter estimates are almost identical between the two implementations, and the number of iterations for convergence is 28 for each.
Problem 4.3
The class posterior probabilities for insects 6, 244, and 421 are plotted below.
# Inspect the classification probabilities of observation 10obs <-c(6, 244, 421)for (i in1:3) {barplot(names.arg =c("Class 1", "Class 2"), EM_result_insects$weights[obs[i], ],col ="cornflowerblue", ylim =c(0, 1),main =paste("Class (posterior) probability observation ", obs[i], sep ="" ) )}
The results are as expected, with observations 6, and 421 having more certain prediction in line with their proximity to the predicted gaussian distributions for each class. Observation 244, which sits on the border of the two distributions, has a posterior probability of around 0.25 for class 1, and 0.75 for class 2, showing the uncertainty in the predicted distribution it belongs to.
Problem 4.4
A general EM algorithm is implemented below, for more than 2 classes, and then used for modelling on the fish data.
# rewrite EM algorithm with any number of componentsEM_GMM_general <-function(x, mu_start, sigma_start, pis_start, n_iter=100) {stopifnot(sum(pis_start) ==1)stopifnot(length(mu_start) ==length(sigma_start))stopifnot(length(mu_start) ==length(pis_start)) M <-length(mu_start) # number of components is length of provided vectors pis_all <-matrix(NA, nrow=n_iter, ncol=M) mu_all <-matrix(NA, nrow=n_iter, ncol=M) sigma_all <-matrix(NA, nrow=n_iter, ncol=M) Q_all <-rep(NA, n_iter) log_like_all <-rep(NA, n_iter)# Initialise mu <- mu_start sigma <- sigma_start pis <- pis_start n <-length(x) W <-matrix(0, nrow = n, ncol = M) log_pdf_class <-matrix(0, nrow = n, ncol = M) # the log-likelihood to compute Qfor (j in1:n_iter) {# Start EM steps# E-step: Compute the expected log-likelihood Qfor (m in1:M) { log_pdf_class[, m] <- (dnorm(x, mean = mu[m], sd=sigma[m], log =TRUE) +log(pis[m])) W[, m] <- pis[m]*dnorm(x, mean = mu[m], sd = sigma[m]) } w <- W/rowSums(W) # normalise weights n_hat <-colSums(w) Q <-sum(rowSums(w*log_pdf_class)) # expected log-likelihood# M-step: Maximise Qfor (m in1:M) { pis[m] <- n_hat[m] / n mu[m] <-1/n_hat[m]*sum(w[, m]*x) sigma[m] <-sqrt(1/n_hat[m]*sum(w[, m]*(x - mu[m])^2)) }# End EM steps, save estimates, Q, and log-likelihood pis_all[j, ] <- pis mu_all[j, ] <- mu sigma_all[j, ] <- sigma Q_all[j] <- Q# log-likelihood at current parameter valuesfor (m in1:M) { W[, m] <- pis[m]*dnorm(x, mean = mu[m], sd = sigma[m]) } log_like_all[j] <-sum(log(rowSums(W))) } # End Em iterations# Return everything as a listreturn(list(pi_hat = pis, mu_hat = mu, sigma_hat = sigma, weights = W/rowSums(W), pis_all = pis_all, mu_all = mu_all, sigma_all = sigma_all, Q_all = Q_all, log_like_all = log_like_all))}
load("fish.RData")# Take a look at the histogram first before implementinghist(fish$length, col ="cornflowerblue", breaks =30, prob =TRUE,main ="Histogram of fish lengths", xlab ="Length")
# 3 Classesmu_start_3classes <-c(23, 35, 60) # from histogramsigma_start_3classes <-c(1, 1, 1)pis_start_3classes <-c(0.2, 0.6, 0.2) # from proportions in histogramn_iter_3classes <-60EM_fish_3classes <-EM_GMM_general(fish$length, mu_start_3classes, sigma_start_3classes, pis_start_3classes,n_iter=n_iter_3classes)# 4 classesmu_start_4classes <-c(23, 33, 39, 60) # accounting for skew in histogramsigma_start_4classes <-c(1, 1, 1, 1)pis_start_4classes <-c(0.1, 0.4, 0.4, 0.1)n_iter_4classes <-60EM_fish_4classes <-EM_GMM_general(fish$length, mu_start_4classes, sigma_start_4classes, pis_start_4classes,n_iter=n_iter_4classes)
# Visualise convergence for each parameter (3 Classes)# Could create a function to do this to tidy notebookmatplot(0:n_iter_3classes,rbind(pis_start_3classes, EM_fish_3classes$pis_all), main='pis',pch=c("o", "o", "o"), col=c("cornflowerblue", "lightcoral", "forestgreen"),xlab ="Iteration", ylab ="pis")legend("topright", legend =c("Class 1", "Class 2", "Class 3"),col=c("cornflowerblue", "lightcoral", "forestgreen"), pch=c("o", "o", "o"))
par(mfrow =c(1, 1))# Inspect convergenceplot(EM_fish_4classes$log_like_all, main ='Log-likelihood', type ="l",col ="cornflowerblue", xlab ="Iteration", ylab ="Log-likelihood")
plot(EM_fish_4classes$Q_all, main ='Expected log-likelihood (Q)', type ="l",col ="cornflowerblue", xlab ="Iteration", ylab ="Log-likelihood")
print("Estimated parameters (pi, mu, and sigma)")
[1] "Estimated parameters (pi, mu, and sigma)"
print(EM_fish_4classes$pi_hat)
[1] 0.08126274 0.43110931 0.32913948 0.15848847
print(EM_fish_4classes$mu_hat)
[1] 22.52155 33.35969 39.54630 51.07660
print(EM_fish_4classes$sigma_hat)
[1] 1.816024 3.626346 6.083748 9.848256
Problem 4.5
# first histogram for 3 class GMMpar(mfrow=c(1, 2))n <-1000sim_probs <-runif(n=1000) # using simulated probabilities for distributionx <-rep(NA, n)x2 <-rep(NA, n)for(i in1:n){if (sim_probs[i] < EM_fish_3classes$pi_hat[1]) { x[i] <-rnorm(n =1, mean = EM_fish_3classes$mu_hat[1],sd = EM_fish_3classes$sigma_hat[1]) }elseif (sim_probs[i] < EM_fish_3classes$pi_hat[1] + EM_fish_3classes$pi_hat[2]) { x[i] <-rnorm(n =1, mean = EM_fish_3classes$mu_hat[2],sd = EM_fish_3classes$sigma_hat[2]) }else { x[i] <-rnorm(n =1, mean = EM_fish_3classes$mu_hat[3],sd = EM_fish_3classes$sigma_hat[3]) }}hist(x, xlab ="length", breaks =30, col ="cornflowerblue",main ="Histogram for 3 Class GMM", prob =TRUE)# second histogram for 4 class GMMfor(i in1:n) {if (sim_probs[i] < EM_fish_4classes$pi_hat[1]) { x2[i] <-rnorm(n =1, mean = EM_fish_4classes$mu_hat[1],sd = EM_fish_4classes$sigma_hat[1]) }elseif (sim_probs[i] < EM_fish_4classes$pi_hat[1] + EM_fish_4classes$pi_hat[2]) { x2[i] <-rnorm(n =1, mean = EM_fish_4classes$mu_hat[2],sd = EM_fish_4classes$sigma_hat[2]) }elseif (sim_probs[i] < (EM_fish_4classes$pi_hat[1] + EM_fish_4classes$pi_hat[2]+ EM_fish_4classes$pi_hat[3])) { x2[i] <-rnorm(n =1, mean = EM_fish_4classes$mu_hat[3],sd = EM_fish_4classes$sigma_hat[3]) }else { x2[i] <-rnorm(n =1, mean = EM_fish_4classes$mu_hat[4],sd = EM_fish_4classes$sigma_hat[4]) }}hist(x2, xlab ="length", breaks =30, col ="cornflowerblue",main ="Histogram for 4 class GMM", prob =TRUE)
The GMM model with 4 classes seems better for modelling the fish lengths, as it captures the distribution of the fish length data at the mode more accurately.
Problem 4.6
The following code implements an unsupervised multivariate Gaussian mixture model on the ASB stock data, and a scatter plot is outputted with predicted classes using the posterior class probabilities and a 0.5 probability threshold.
# Training dataX_train <- asb$CloseX_train <-cbind(X_train, log(asb$Volume))colnames(X_train) <-c("Close", "log_vol")# Initialising starting parameterspis_start <-c(0.5, 0.5)mu_start <-list(c(18, 13.5), c(24, 13.5))sigma_start <-list(diag(1, 2), diag(1, 2))# using mvnormalmixEM for a Gaussian mixture modelEM_result_asb <-mvnormalmixEM(X_train, lambda=pis_start,mu=mu_start, sigma=sigma_start)
number of iterations= 7
# use the posterior class probabilities to plot predicted classesy_prob_hat <- EM_result_asb$posterior[, "comp.1"]X_train <-cbind(y_prob_hat, X_train)X_train_clust1 <- X_train[y_prob_hat >0.5, ] # 0.5 prediction thresholdX_train_clust2 <- X_train[y_prob_hat <=0.5, ]plot(X_train_clust1[, "log_vol"], X_train_clust1[, "Close"], col="cornflowerblue",xlab ="log(Volume)", ylab ="Close", ylim =c(14, 27),main ="ASB Stock Close vs log(Volume) with Predicted Classes")lines(X_train_clust2[, "log_vol"], X_train_clust2[, "Close"], col="lightcoral",type="p")legend("topright", legend =c("Class 1", "Class 2"),col=c("cornflowerblue", "lightcoral"), pch=c("o", "o"))
There are slight differences in the estimated parameters from the supervised and semi-supervised models, the most significant difference is in the estimation of the sigma values. This is to be expected as the data set for the supervised case with no missing values had labelled points which were more tightly spread around their respective clusters, whereas the the semi supervised algorithm incorporated the missing data points which were spread out further from the cluster means, resulting in the estimated distributions have larger SD values to account for this spread out data.
Both the supervised and semi-supervised algorithms attain the same accuracy on the test data. This is due to the fact that there are two points classified differently by each of the models, but since the true values of the respective points are different neither models accuracy can be higher than the other.